Name: The brand and model of the car.
Location: The location in which the car is being sold or is available for purchase.
Year: The year or edition of the model.
Kilometers_Driven: The total kilometres driven in the car by the previous owner(s) in KM.
Fuel_Type: The type of fuel used by the car.
Transmission: The type of transmission used by the car.
Owner_Type: Whether the ownership is first-hand, second-hand or other.
Mileage: The standard mileage offered by the car company in kmpl or km/kg.
Engine: The displacement volume of the engine in cc.
Power: The maximum power of the engine in bhp.
Seats: The number of seats in the car.
New_Price: The price of a new car of the same model.
Price: The price of the used car in INR Lakhs.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
# Load the training and test workbooks (paths are relative to the working directory).
Train=pd.read_excel('Data_Train.xlsx')
Test=pd.read_excel('Data_Test.xlsx')
Train.head()
Test.shape ,Train.shape
# Number of fully duplicated rows in the training data.
Train.duplicated().sum()
# Keep an untouched copy of the test data: `Test` is modified in place further
# down, while `Test_copy` is what the predictions are attached to at the end.
Test_copy=Test.copy()
# Inspect duplicated test rows and the rows of one specific model name.
Test[Test.duplicated()]
Test[Test['Name'].str.contains("Honda City 1.5 E MT")]
# Remove the unit suffix from the Mileage column and convert it to float.
for Unit in ['km/kg', 'kmpl']:
    # regex=False: the units are literal text, not regular expressions.
    Train['Mileage'] = Train['Mileage'].str.replace(Unit, ' ', regex=False)
Train['Mileage'] = Train['Mileage'].astype(float)

# Split the mileage into one column per unit, selected by fuel type:
#   CNG / LPG       -> Mileage_per_kg
#   Petrol / Diesel -> Mileage_per_l
# Every other fuel type (e.g. Electric) gets 0.0 in both columns.
# This is done with boolean masks instead of a row loop with chained
# assignment (Train[col][i] = ...), which pandas does not guarantee to write
# back to the frame.
train_gas_rows = Train['Fuel_Type'].isin(['CNG', 'LPG'])
train_liquid_rows = Train['Fuel_Type'].isin(['Petrol', 'Diesel'])
Train['Mileage_per_kg'] = Train['Mileage'].where(train_gas_rows, 0.0).astype(float)
Train['Mileage_per_l'] = Train['Mileage'].where(train_liquid_rows, 0.0).astype(float)
Train
# Compare the fuel types present in each data set.
Train['Fuel_Type'].unique()
Test['Fuel_Type'].unique()
Test.info()

# Remove the unit suffix from the Mileage column of the test data.
for Unit in ['km/kg', 'kmpl']:
    # regex=False: the units are literal text, not regular expressions.
    Test['Mileage'] = Test['Mileage'].str.replace(Unit, ' ', regex=False)
Test['Mileage'] = Test['Mileage'].astype(float)

# Split the mileage into one column per unit, selected by fuel type:
#   CNG / LPG       -> Mileage_per_kg
#   Petrol / Diesel -> Mileage_per_l
# Every other fuel type (e.g. Electric) gets 0.0 in both columns.
# Boolean masks replace the row loop with chained assignment
# (Test[col][i] = ...), which pandas does not guarantee to write back.
test_gas_rows = Test['Fuel_Type'].isin(['CNG', 'LPG'])
test_liquid_rows = Test['Fuel_Type'].isin(['Petrol', 'Diesel'])
Test['Mileage_per_kg'] = Test['Mileage'].where(test_gas_rows, 0.0).astype(float)
Test['Mileage_per_l'] = Test['Mileage'].where(test_liquid_rows, 0.0).astype(float)
Test
# Strip the unit suffixes from Engine and Power in both data sets.
# str.strip() removes any of the given *characters* from both ends of each
# value; the columns are still strings afterwards.
for frame in (Train, Test):
    frame['Engine'] = frame['Engine'].str.strip('CC')
    frame['Power'] = frame['Power'].str.strip('bhp')
Train
Test
# Derive Brand, Model and Version from the space-separated Name column:
# first word -> Brand, second word -> Model, words 3..7 re-joined -> Version.
for frame in (Train, Test):
    name_words = frame['Name'].str.split(' ')
    frame['Brand'] = name_words.str[0]
    frame['Model'] = name_words.str[1]
    frame['Version'] = name_words.str[2:7].str.join(" ")
Train
Test
Dealing with null/missing values
Train.info()
Test.info()
Train.isnull().sum()

# Replace the textual 'null ' placeholder in Power with 0.0 and convert the
# column to float. A single .loc assignment is used instead of a row loop with
# chained assignment (frame['Power'][i] = ...), which pandas does not
# guarantee to write back to the frame.
# NOTE(review): a power of 0.0 bhp is not a realistic value; consider using
# NaN here so these rows go through the mode imputation below - confirm intent.
for frame in (Train, Test):
    frame.loc[frame['Power'] == 'null ', 'Power'] = 0.0
    frame['Power'] = frame['Power'].astype(float)

# Columns that contain null values in each data set.
Null_column_Train = ["Mileage", "Engine", "Power", "Seats"]
Null_column_Test = ["Engine", "Power", "Seats"]
# Defining a function for replacing null values
def Null_replacing_Fun(dataset, Null_column):
    """Fill the nulls of the listed columns with each column's mode, in place.

    Args:
        dataset: DataFrame to modify.
        Null_column: iterable of column names whose missing values are filled.

    Returns:
        None. ``dataset`` is modified in place and printed once at the end.
    """
    for column in Null_column:
        modes = dataset[column].mode()
        if modes.empty:
            # Entirely-null column: there is no mode to fill with, leave as is.
            continue
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on a column selection, which may operate on a
        # temporary copy and leave `dataset` unchanged.
        dataset[column] = dataset[column].fillna(modes.iloc[0])
    print(dataset)
# Fill the null values of the training data
Null_replacing_Fun(Train, Null_column_Train)
Train.info()
Train.isnull().sum()
# Fill the null values of the test data
Null_replacing_Fun(Test, Null_column_Test)
Test.isnull().sum()
Test.info()
Test
Test.head()

# Age of the car in years, measured from 2020
Train['Age'] = 2020 - Train['Year']
Train
Test['Age'] = 2020 - Test['Year']
Test

# Quick look at the names and at any remaining nulls
Train['Name'].unique()
Test['Name'].unique()
Train.loc[Train['Mileage'].isnull()]
Train.loc[Train['Power'].isnull()]
Train['Name'].value_counts()
Test['Name'].value_counts()

# Removing outliers: rows listed with fewer than two seats
print(Train.loc[Train['Seats'] < 2])
Train.drop(Train.loc[Train['Seats'] < 2].index, inplace=True)
Train.loc[Train['Seats'] < 2]

# Dropping outliers: rows with a price above 100
Train.loc[Train['Price'] > 100]
Train = Train.drop(Train.loc[Train['Price'] > 100].index)
Train.info()
Train.describe()

# Dropping outliers: rows with more than 800000 km driven
Train.loc[Train['Kilometers_Driven'] > 800000]
Train = Train.drop(Train.loc[Train['Kilometers_Driven'] > 800000].index)
# Bar charts of the value frequencies of four categorical columns
for column in ('Year', 'Fuel_Type', 'Owner_Type', 'Location'):
    Train[column].value_counts().plot.bar()

# Price and mileage summaries
Train['Price'].sort_values(ascending=False)
Train['Price'].mean()
Train[Train['Price'] > 9.47]
Train['Mileage'].sort_values()

# Owner type per location, and the share of listings per location
pd.crosstab(Train['Location'], Train['Owner_Type'])
Train['Location'].value_counts(normalize=True)

# Distribution of Year and Price
sns.distplot(Train["Year"])
sns.distplot(Train["Price"])
# Price by fuel type
plt.figure(figsize=(20, 5))
sns.boxplot(data=Train, x='Fuel_Type', y='Price')

# Price of car vs. location, split by fuel type
plt.figure(figsize=(15, 10))
sns.boxplot(x=Train['Location'], y=Train['Price'], data=Train, hue=Train['Fuel_Type'])

# Number of cars from each brand, with the count written above every bar
plt.figure(figsize=(12, 8))
plot = sns.countplot(x='Brand', data=Train)
plt.xticks(rotation=90)
for p in plot.patches:
    # Anchor the label at the top centre of the bar, then nudge it 5 points up.
    plot.annotate(
        p.get_height(),
        (p.get_x() + p.get_width() / 2.0, p.get_height()),
        ha='center',
        va='center',
        xytext=(0, 5),
        textcoords='offset points',
    )
plt.title("Count of cars based on Brands")
plt.xlabel("Car Brand")
plt.ylabel("Count of cars")
# Price by transmission, owner type and location
plt.figure(figsize=(20, 5))
sns.boxplot(data=Train, x='Transmission', y='Price')
plt.figure(figsize=(20, 5))
sns.boxplot(data=Train, x='Owner_Type', y='Price', color='red')
plt.figure(figsize=(20, 5))
sns.boxplot(data=Train, x='Location', y='Price', color='red')

# Price against kilometres driven, plus single-variable views
sns.jointplot(x='Price', y='Kilometers_Driven', data=Train, kind='hex')
sns.set(style="white", palette="muted", color_codes=True)
Train['Year'].hist(bins=30)
Train['Mileage'].hist(bins=30)
sns.violinplot(y='Price', data=Train)
sns.boxplot(y='Kilometers_Driven', data=Train)
sns.jointplot(x='Price', y='Kilometers_Driven', data=Train)

# Span of the Year column
first_year = Train.Year.min()
last_year = Train.Year.max()
print("Minimum year in the data is:", first_year)
print("Maximum year in the data is:", last_year)
print("Range of year is from {} to {},value is{}".format(last_year, first_year, last_year - first_year))
print("Default bin is 50,so each bar corresponds to the value of:", (last_year - first_year) / 50)

# Mean price per engine size
plt.figure(figsize=(20, 10))
sns.barplot(x="Engine", y="Price", data=Train)

# Price against kilometres driven, coloured by brand, fuel type and location
for hue_column in ('Brand', 'Fuel_Type', 'Location'):
    plt.figure(figsize=(20, 10))
    sns.scatterplot(x='Kilometers_Driven', y='Price', data=Train, hue=hue_column)
    plt.xlim(0, 350000)

# Box plots: price per brand, then kilometres driven and price on their own
plt.figure(figsize=(10, 30))
sns.boxplot(data=Train, x='Price', y='Brand')
plt.figure(figsize=(10, 5))
sns.boxplot(data=Train, x='Kilometers_Driven')
plt.figure(figsize=(10, 10))
sns.boxplot(data=Train, x='Price')
plt.figure(figsize=(10, 10))
sns.boxplot(data=Train, x='Kilometers_Driven')

# Cross-tabulate the car name against location, fuel type and owner type
pd.crosstab(Train['Name'], Train['Location'])
pd.crosstab(Train['Name'], Train['Fuel_Type'])
pd.crosstab(Train['Name'], Train['Owner_Type'])
# Min-max scale Kilometers_Driven. The minimum and range are taken from the
# TRAINING data and reused for the test data, so the same raw value maps to
# the same scaled value in both sets (scaled test values may therefore fall
# outside [0, 1]).
km_min = Train['Kilometers_Driven'].min()
km_range = Train['Kilometers_Driven'].max() - km_min
Train['Kilometers_Driven'] = (Train['Kilometers_Driven'] - km_min) / km_range
Train.Kilometers_Driven
Test['Kilometers_Driven'] = (Test['Kilometers_Driven'] - km_min) / km_range
Test.Kilometers_Driven

# Min-max scale the remaining numeric features. The scaler is fitted on the
# training data only and then applied unchanged to the test data; fitting a
# second scaler on the test data would put the two sets on different scales.
num_vars = ['Mileage', 'Power', 'Engine', 'Mileage_per_kg', 'Mileage_per_l']
scaler = MinMaxScaler()
Train[num_vars] = scaler.fit_transform(Train[num_vars])
Train
Test[num_vars] = scaler.transform(Test[num_vars])
Test
# Correlation heat maps. Only numeric columns are correlated: calling .corr()
# on a frame that still holds string columns (Name, Location, ...) raises in
# pandas 2.x.
numeric_train = Train.select_dtypes(include='number')
corrmat = numeric_train.corr()
f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(corrmat, vmax=.8, square=True)
corr = numeric_train.corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True,
            cmap=sns.diverging_palette(220, 20, as_cmap=True))

# Price trend over the model year
sns.lmplot(x='Year', y='Price', data=Train)

# Box plots of the columns
plt.figure(figsize=(10, 6))
sns.boxplot(data=Train)
Train.boxplot(column=["Age", "Seats"])
Test.info()
sns.boxplot(x=Train['Age'])
Train[Train['Seats'].isnull()]

# Distribution of the target and its central tendency
x = Train['Price']
plt.figure(figsize=(10, 6))
sns.distplot(x).set_title('Frequency Distribution Plot of Prices')
print(Train['Price'].mean())
print(Train['Price'].median())
print(Train['Price'].mode())
Regression
# Age vs. price with a fitted regression line
plt.figure(figsize=(20, 8))
age_axes = sns.regplot(x='Age', y='Price', data=Train)
age_axes.set_title('Age vs Price')

# NOTE(review): Kilometers_Driven was min-max scaled earlier in this file, so
# a threshold of 400000 no longer removes any row - confirm the intended cut.
Train = Train[Train['Kilometers_Driven'] < 400000]

# Kilometres driven vs. price with a fitted regression line
plt.figure(figsize=(10, 6))
km_axes = sns.regplot(x='Kilometers_Driven', y='Price', data=Train)
km_axes.set_title('Km vs Price')

# Price by location and by fuel type
plt.figure(figsize=(20, 15))
sns.boxplot(data=Train, x='Location', y='Price')
plt.figure(figsize=(20, 10))
sns.boxplot(data=Train, x='Fuel_Type', y='Price')

# One pie chart per categorical column, side by side in a 1x5 grid
plt.figure(figsize=(20, 5))
pie_panels = (
    ('Location', "Location distribution"),
    ('Owner_Type', "Owner Type distribution"),
    ('Fuel_Type', "Fuel_Type distribution"),
    ('Transmission', "Transmission distribution"),
    ('Seats', "Seats distribution"),
)
for panel_number, (column, panel_title) in enumerate(pie_panels, start=1):
    plt.subplot(1, 5, panel_number)
    frequencies = Train[column].value_counts()
    x = frequencies.index
    y = frequencies.values
    plt.pie(y, labels=x, autopct='%1.1f%%')
    plt.title(panel_title)

# Pairwise relationships, then price over the years by fuel / owner type
sns.pairplot(Train)
sns.set(font_scale=1)
sns.relplot(x="Year", y="Price", data=Train, hue="Fuel_Type", kind="line", col="Owner_Type")
sns.relplot(
    x="Year",
    y="Price",
    data=Train,
    hue="Fuel_Type",
    size="Transmission",
    sizes=(20, 200),
)
sns.relplot(x="Year", y="Price", data=Train, hue="Owner_Type", kind="line")
Encoding the categorical variables
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

New_train = Train.copy()
New_test = Test.copy()

# One-hot encode Location and Brand (appended as float dummy columns).
New_train = pd.concat([New_train, pd.get_dummies(New_train['Location'], prefix='Location').astype(float)], axis=1)
New_test = pd.concat([New_test, pd.get_dummies(New_test['Location'], prefix='Location').astype(float)], axis=1)
Train
New_train = pd.concat([New_train, pd.get_dummies(New_train['Brand'], prefix='Br').astype(float)], axis=1)
New_test = pd.concat([New_test, pd.get_dummies(New_test['Brand'], prefix='Br').astype(float)], axis=1)
Train

# Label-encode the remaining categoricals. Each encoder is fitted on the
# values of BOTH data sets so a category gets the same integer code in train
# and test; fitting separately gives different codes whenever the two sets do
# not contain exactly the same categories.
for column in ('Fuel_Type', 'Owner_Type', 'Transmission'):
    lben = LabelEncoder()
    lben.fit(pd.concat([New_train[column], New_test[column]]))
    New_train[column] = lben.transform(New_train[column]).astype(float)
    New_test[column] = lben.transform(New_test[column]).astype(float)
New_test
New_train

# Year is dropped from the model frames.
New_train = New_train.drop(['Year'], axis=1)
New_test = New_test.drop(['Year'], axis=1)

# Correlations are computed on numeric columns only: .corr() on a frame that
# still holds string columns raises in pandas 2.x.
New_train.select_dtypes(include='number').corr()
plt.figure(figsize=(15, 10))
Formodel = Train[['Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Power', 'Engine', 'Seats', 'Age', 'Mileage_per_l', 'Mileage_per_kg']]
sns.heatmap(Formodel.select_dtypes(include='number').corr(), annot=True)
plt.figure(figsize=(20, 20))
sns.heatmap(New_train.select_dtypes(include='number').corr(), annot=True)
# Add to the training frame the two brand dummy columns that are read from the
# test frame below (Br_Hindustan, Br_OpelCorsa), initialised to 0.0.
New_train['Br_Hindustan']=[0.0 for row in New_train.index]
New_train['Br_OpelCorsa']=[0.0 for row in New_train.index]
# Move each new column from the end of the frame to a fixed position by
# dropping and re-inserting it. The values inserted from New_test are
# overwritten with 0.0 again right after, so only the position matters.
# NOTE(review): 37 and 51 are hard-coded positions, presumably chosen to line
# the columns up with the test frame's order - verify if the data changes.
mid = New_test['Br_Hindustan']
New_train.drop(labels=['Br_Hindustan'], axis=1,inplace = True)
New_train.insert(37, 'Br_Hindustan', mid)
mid = New_test['Br_OpelCorsa']
New_train.drop(labels=['Br_OpelCorsa'], axis=1,inplace = True)
New_train.insert(51, 'Br_OpelCorsa', mid)
New_train['Br_Hindustan']=[0.0 for row in New_train.index]
New_train['Br_OpelCorsa']=[0.0 for row in New_train.index]
New_train.info()
# Add to the test frame the two brand dummy columns that are read from the
# training frame below (Br_Ambassador, Br_Force), initialised to 0.0.
New_test['Br_Ambassador']=[0.0 for row in New_test.index]
New_test['Br_Force']=[0.0 for i in New_test.index]
# Move the new columns to fixed positions in the test frame, as done for the
# training frame above; the inserted values are reset to 0.0 afterwards.
# NOTE(review): 27 and 34 are hard-coded positions - verify if the data changes.
mid = New_train['Br_Ambassador']
New_test.drop(labels=['Br_Ambassador'], axis=1,inplace = True)
New_test.insert(27, 'Br_Ambassador', mid)
mid = New_train['Br_Force']
New_test.drop(labels=['Br_Force'], axis=1,inplace = True)
New_test.insert(34, 'Br_Force', mid)
New_test['Br_Ambassador']=[0.0 for row in New_test.index]
New_test['Br_Force']=[0.0 for i in New_test.index]
New_test.info()
# Feature matrix and target for training: drop the target, the raw text
# columns that were encoded above, the raw Mileage column and Br_Smart.
X = New_train.drop(['Price', 'Name', 'Brand', 'Location', 'Model', 'Version', 'Mileage', "Br_Smart"], axis=1)
y = New_train['Price']
float_columns = ['Age', 'Kilometers_Driven', 'Engine', 'Power']
for column in float_columns:
    X[column] = X[column].astype(float)
X.info()

# Same preparation for the frame that is predicted on at the end.
New_test = New_test.drop(['Name', 'Brand', 'Location', 'Model', 'Version', 'Mileage'], axis=1)
for column in float_columns:
    New_test[column] = New_test[column].astype(float)
# Guarantee that the prediction frame has exactly the training columns, in the
# same order: columns missing from the test data are added as 0.0 and columns
# unknown to the model are dropped. This is a no-op when both already match.
New_test = New_test.reindex(columns=X.columns, fill_value=0.0)
New_test
New_test.info()
Cross validation
# Function for computing the prediction score
def Scr(Algo, X_eval=None, y_eval=None):
    """Return ``Algo.score(...)`` multiplied by 100.

    Args:
        Algo: fitted estimator exposing ``score(X, y)``.
        X_eval: features to score on; defaults to the module-level ``X_test``.
        y_eval: targets to score on; defaults to the module-level ``y_test``.

    Returns:
        The estimator's score scaled to a percentage.
    """
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test
    return Algo.score(X_eval, y_eval) * 100
# k-fold cross-validation
def k_fold(model, X_cv=None, y_cv=None, n_splits=5):
    """Return the mean R^2 of ``model`` over an unshuffled k-fold split.

    Args:
        model: estimator passed to ``cross_val_score``.
        X_cv: features to cross-validate on; defaults to the module-level
            ``X_test``.
        y_cv: targets to cross-validate on; defaults to the module-level
            ``y_test``.
        n_splits: number of folds (default 5).

    Returns:
        The average of the per-fold ``r2`` scores.
    """
    # NOTE(review): the defaults cross-validate on the hold-out split only;
    # pass the training data explicitly if that is what should be validated.
    if X_cv is None:
        X_cv = X_test
    if y_cv is None:
        y_cv = y_test
    k_folds = model_selection.KFold(n_splits=n_splits, shuffle=False)
    scores = model_selection.cross_val_score(model, X_cv, y_cv, cv=k_folds, scoring='r2')
    return np.mean(scores)
# Function for the root mean squared error
def rmsle(y_pred, y_true=None):
    """Return the root mean squared error of ``y_pred``.

    Despite the name, no logarithm is applied: this is a plain RMSE.

    Args:
        y_pred: predicted target values.
        y_true: actual target values; defaults to the module-level ``y_test``.

    Returns:
        Square root of the mean squared error between ``y_true`` and ``y_pred``.
    """
    if y_true is None:
        y_true = y_test
    return np.sqrt(mean_squared_error(y_true, y_pred))
Splitting Data
# Split the labelled data into a training part (80%) and a hold-out part (20%)
# used for validation; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20, random_state=123)
X_test.info()
Starting with the linear regression model
# Linear regression: fit on the training split
linreg = LinearRegression().fit(X_train, y_train)
# Predictions for the hold-out split
y_pred_ls = linreg.predict(X_test)
# score() is the R^2 on the hold-out split, printed as a percentage
print(linreg.score(X_test, y_test) * 100, '% Prediction Accuracy')
GBR Model
# Gradient boosting regressor.
# The loss is left at its default (squared error): the old alias 'ls' was
# removed in scikit-learn 1.2, while the default works on every version.
gbr = GradientBoostingRegressor(max_depth=6)
gbr.fit(X_train, y_train)
# Predictions for the hold-out split
y_pred_gbr = gbr.predict(X_test)
# RMSE on the hold-out split (displayed only, not stored)
np.sqrt(metrics.mean_squared_error(y_test, y_pred_gbr))
# score() is the R^2 on the hold-out split, printed as a percentage
print(gbr.score(X_test, y_test) * 100, '% Prediction Accuracy')
XGB Model
# XGBoost regressor.
# `verbosity=0` and `n_jobs=-1` replace the deprecated `silent=1` and
# `nthread=-1` arguments (quiet training, use all available threads).
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.52,
    gamma=0.03,
    learning_rate=0.072,
    max_depth=6,
    min_child_weight=2,
    n_estimators=2200,
    reg_alpha=0,
    reg_lambda=1,
    subsample=0.615,
    verbosity=0,
    random_state=7,
    n_jobs=-1,
)
model_xgb.fit(X_train, y_train)
# Predictions on the training split and on the hold-out split
xg_t_pre = model_xgb.predict(X_train)
xg_pre = model_xgb.predict(X_test)
y_predr = model_xgb.predict(X_test)
# score() is the R^2 on the hold-out split, printed as a percentage
print(model_xgb.score(X_test, y_test) * 100, '% Prediction Accuracy')
Random Forest Model
# Random forest with default settings, fitted on the training split
RF = RandomForestRegressor()
RF.fit(X_train, y_train)
# Predictions for the hold-out split (kept under both names)
predic = RF.predict(X_test)
y_pred_RF = RF.predict(X_test)
# score() is the R^2 on the hold-out split, printed as a percentage
print(RF.score(X_test, y_test) * 100, '% Prediction Accuracy')
Lasso Model
# Lasso regression with default settings, fitted on the training split
ls = Lasso()
ls.fit(X_train, y_train)
# Predictions for the hold-out split
ls_predic = ls.predict(X_test)
# score() is the R^2 on the hold-out split, printed as a percentage
print(ls.score(X_test, y_test) * 100, '% Prediction Accuracy')
LGB Model
# LightGBM regressor, fitted on the training split and used to predict the
# hold-out split.
# NOTE(review): categorical_feature lists column positions of X_train; confirm
# that positions 0, 2, 3, 4, 8, 9 and 10 really are the categorical columns of
# the current feature layout.
# NOTE(review): reg_sqrt is passed as the string 'True' rather than a boolean,
# and task='predict' is set on a model that is being trained - verify against
# the LightGBM parameter documentation.
lgb_model = lgb.LGBMRegressor(categorical_feature= [0,2,3,4,8,9,10],task = 'predict',application = 'regression',
objective = 'root_mean_squared_error',boosting_type="gbdt",num_iterations = 2500,
learning_rate = 0.05,num_leaves=15,tree_learner='feature',max_depth =10,min_data_in_leaf=7,
bagging_fraction = 1,bagging_freq = 100,reg_sqrt='True',metric ='rmse',feature_fraction = 0.6,
random_state=42)
lgb_model.fit(X_train,y_train)
lgb_model_pred = lgb_model.predict(X_test)
# Comparison table: one row per model with its hold-out score, RMSE and
# cross-validated R^2, each rounded to four decimals.
print('model ', 'Score ', 'rmsle', 'k-fold')
fitted_models = (
    ('linreg', linreg, y_pred_ls),
    ('gbr ', gbr, y_pred_gbr),
    ('xgb ', model_xgb, y_predr),
    ('RF ', RF, y_pred_RF),
    ('ls ', ls, ls_predic),
    ('lgb ', lgb_model, lgb_model_pred),
)
for row_label, estimator, holdout_prediction in fitted_models:
    print(
        row_label,
        round(Scr(estimator), 4),
        round(rmsle(holdout_prediction), 4),
        round(k_fold(estimator), 4),
    )
GBR, RF and LGB perform well, but XGB appears to be the best model.
So XGB is used for the final prediction.
# Predict the price of every row of the prepared test frame with XGB.
prediction = model_xgb.predict(New_test)
sol = pd.DataFrame({'Price': prediction})
sol = round(sol['Price'], 2)

# Attach the predictions, rounded to two decimals, to the untouched copy of
# the test data.
Test_copy['Prediction'] = prediction
Test_copy['Prediction'] = Test_copy['Prediction'].round(2)

# Write the result workbook. The context manager saves and closes the file on
# exit; ExcelWriter.save() no longer exists in pandas 2.x.
with pd.ExcelWriter('Test_Final.xlsx', engine='xlsxwriter') as writer:
    Test_copy.to_excel(writer, sheet_name='Sheet1', index=False)
Test_copy.head(50)